# pip install datasets sympy

import json
from datasets import load_dataset
import random

# Source 1: load a public dataset.
# Request only the first 500 training examples of the GSM8K "main" config.
gsm8k = load_dataset(
    "gsm8k",
    "main",
    split="train[:500]",
)
# Keep just the question text of each example.
pub_questions = [example["question"] for example in gsm8k]

# Source 2: generate synthetic questions.
def generate_geometry_questions(num=100):
    """Return ``num`` distinct sphere-volume questions with integer radii.

    Each question is the Chinese prompt "计算半径为{r}的球体体积"
    ("compute the volume of a sphere of radius r"). Radii are drawn without
    replacement, so every returned question is unique.

    Args:
        num: Number of questions to generate.

    Returns:
        A list of ``num`` question strings in random order.

    Raises:
        ValueError: If ``num`` is negative (raised by ``random.sample``).
    """
    # Sampling without replacement needs at least ``num`` candidate radii.
    # The pool is 1..19 for small requests (unchanged behaviour) and is
    # widened to 1..num when more questions are requested, instead of letting
    # random.sample raise "Sample larger than population".
    radius_pool = range(1, max(20, num + 1))
    return [f"计算半径为{r}的球体体积" for r in random.sample(radius_pool, num)]

syn_questions = generate_geometry_questions(200)

# Merge both sources and drop exact duplicate strings, then randomize order.
questions = list(set(pub_questions + syn_questions))
random.shuffle(questions)

# Save the question list, one question per line.
# The synthetic questions contain Chinese text, so the encoding is pinned to
# UTF-8 rather than relying on the platform default (which may be unable to
# encode them, or may produce a file other tools misread).
# NOTE(review): a question that itself contains a newline would span several
# lines in this file — confirm whether the dataset can contain such entries.
with open("math_questions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(questions))
